This section will read in the data. And print a summary
The following are the mapping according to the data dictionary. season_mapping = {1:“winter”, 2:“spring”, 3:“summer”, 4:“fall”} yr_mapping = {0: “2011”, 1:“2012”} mnth_mapping = {1:“Jan”,2:“Feb”,3:“Mar”,4:“Apr”,5:“May”,6:“Jun”,7:“Jul”,8:“Aug”,9:“Sep”,10:“Oct”,11:“Nov”,12:“Dec”} weekday_mapping={0: ‘Sunday’, 1: ‘Monday’, 2: ‘Tuesday’,3: ‘Wednesday’, 4: ‘Thursday’, 5: ‘Friday’, 6: ‘Saturday’} weather_mapping={1:“Clear”,2:“Misty”,3:“Light Snow/Rain”,4:“Heavy Snow/Rain”}
library(tidyverse)
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
bikesharing <- read_csv("./Data/hour.csv")
## Parsed with column specification:
## cols(
## instant = col_double(),
## dteday = col_date(format = ""),
## season = col_double(),
## yr = col_double(),
## mnth = col_double(),
## hr = col_double(),
## holiday = col_double(),
## weekday = col_double(),
## workingday = col_double(),
## weathersit = col_double(),
## temp = col_double(),
## atemp = col_double(),
## hum = col_double(),
## windspeed = col_double(),
## casual = col_double(),
## registered = col_double(),
## cnt = col_double()
## )
# yr_mapping = {0: "2011", 1:"2012"}
# mnth_mapping = {1:"Jan",2:"Feb",3:"Mar",4:"Apr",5:"May",6:"Jun",7:"Jul",8:"Aug",9:"Sep",10:"Oct",11:"Nov",12:"Dec"}
# weekday_mapping={0: 'Sunday', 1: 'Monday', 2: 'Tuesday',3: 'Wednesday', 4: 'Thursday', 5: 'Friday', 6: 'Saturday'}
# weather_mapping={1:"Clear",2:"Misty",3:"Light Snow/Rain",4:"Heavy Snow/Rain"}
bikesharing %<>% mutate(
season=case_when(
season==1 ~ "winter",
season==2 ~ "spring",
season==3 ~ "summer",
season==4 ~ "fall",
TRUE~"NA" ),
yr = case_when(
yr==0 ~"2011",
yr==1 ~ "2012"
),
mnth = case_when(
mnth==1 ~"Jan",
mnth==2 ~"Feb",
mnth==3 ~"Mar",
mnth==4 ~"Apr",
mnth==5 ~"May",
mnth==6 ~"Jun",
mnth==7 ~"Jul",
mnth==8 ~"Aug",
mnth==9 ~"Sep",
mnth==10 ~"Oct",
mnth==11 ~"Nov",
mnth==12 ~"Dec",
),
weekday= case_when(
weekday==0 ~ "Sunday",
weekday==1 ~ "Monday",
weekday==2 ~ "Tuesday",
weekday==3 ~ "Wednesday",
weekday==4 ~ "Thursday",
weekday==5 ~ "Friday",
weekday==6 ~ "Saturday"
),
weathersit= case_when(
weathersit==1 ~"Clear",
weathersit==2 ~"Misty",
weathersit==3 ~"Light Snow or Rain",
weathersit==4 ~"Heavy Snow or Rain"
)
)
bikesharing %>% slice_sample(n=200)
## # A tibble: 200 x 17
## instant dteday season yr mnth hr holiday weekday workingday
## <dbl> <date> <chr> <chr> <chr> <dbl> <dbl> <chr> <dbl>
## 1 1300 2011-02-27 winter 2011 Feb 8 0 Sunday 0
## 2 7159 2011-10-30 fall 2011 Oct 21 0 Sunday 0
## 3 9105 2012-01-20 winter 2012 Jan 6 0 Friday 1
## 4 7883 2011-11-30 fall 2011 Nov 2 0 Wednes… 1
## 5 17314 2012-12-29 winter 2012 Dec 6 0 Saturd… 0
## 6 15788 2012-10-25 fall 2012 Oct 0 0 Thursd… 1
## 7 6277 2011-09-24 fall 2011 Sep 2 0 Saturd… 0
## 8 5507 2011-08-22 summer 2011 Aug 8 0 Monday 1
## 9 13767 2012-08-01 summer 2012 Aug 19 0 Wednes… 1
## 10 5437 2011-08-19 summer 2011 Aug 10 0 Friday 1
## # … with 190 more rows, and 8 more variables: weathersit <chr>, temp <dbl>,
## # atemp <dbl>, hum <dbl>, windspeed <dbl>, casual <dbl>, registered <dbl>,
## # cnt <dbl>
We will plot the distribution plot of the registered and casual riders all up
bikeshare_plot<-bikesharing %>% pivot_longer(cols = c(registered,casual),names_to="RideType",values_to="NumRides") %>% select(RideType,NumRides)
fig <- ggplot(data = bikeshare_plot) +
geom_density(mapping = aes(x=NumRides,fill=RideType),alpha=0.25) +
scale_y_continuous(labels=function(n){format(n, scientific = FALSE)})+
ggtitle("Distribution of Riders",subtitle = "Casual Vs Registered") +
xlab("Number of Rides")+
ylab("Density")+
theme_classic() +
theme(
legend.position = c(.95, .95),
legend.justification = c("right", "top"),
legend.box.just = "right",
legend.margin = margin(6, 6, 6, 6)
) +
theme(legend.title = element_blank()) +
theme(plot.title = element_text(hjust = .5)) +
theme(plot.subtitle = element_text(hjust = .5))
plot(x = fig)
ggsave("./figs/rides_distributions.png",plot = fig,device = "png")
rm(bikeshare_plot)
Plot the time series. Evolution of Rides by time.
plot_data<- bikesharing %>% select(dteday,registered,casual) %>%
group_by(dteday) %>% summarise(registered=sum(registered),casual=sum(casual)) %>%
pivot_longer(cols=c(registered,casual),names_to="RideType",values_to="NumberOfRides")
fig<-ggplot(data = plot_data) +
geom_line(mapping = aes(x=dteday,y=NumberOfRides,color=RideType))+
ggtitle("Number of Riders by day")+
xlab("Date") +
ylab("Number of Riders") +
theme_classic()+
theme(
legend.position = c(1, 1),
legend.justification = c("right", "top"),
legend.box.just = "right",
legend.margin = margin(0, 0, 0, 0),
legend.title = element_blank()
) +
theme(plot.title = element_text(hjust = 0.5))
fig
ggsave("./figs/rides_daily.png",plot = fig,device = "png")
rm(plot_data)
PLotting Time Series with Rolling means and standard deviations
windowsize <- 7
plot_data<- bikesharing %>% select(dteday,registered,casual) %>%
group_by(dteday) %>%
summarise(sumreg=sum(registered),sumcas=sum(casual)) %>%
mutate(regrollm=rollmean(x=sumreg,k=windowsize,align="right",fill=NA),
casrollm=rollmean(x=sumcas,k=windowsize,align="right",fill=NA),
regrollsd=rollapplyr(data= sumreg,width=windowsize,FUN="sd",fill=NA),
casrollsd=rollapplyr(data= sumcas,width=windowsize,FUN="sd",fill=NA)
) %>% select(dteday,registered=regrollm,registeredsd=regrollsd,casual=casrollm,casualsd=casrollsd) %>% filter(!is.na(registered))
plot_data_1<- plot_data %>% select(dteday,registered,casual) %>%
pivot_longer(cols = c("registered","casual"),names_to="RideType",values_to="RollingMeanRides")
plot_data_2<- plot_data %>% select(dteday,registeredsd,casualsd) %>%
pivot_longer(cols = c("registeredsd","casualsd"),names_to="RideType",values_to="RollingsdRides")
fig <- ggplot(data = plot_data_1) + geom_line(mapping = aes(x=dteday,y=RollingMeanRides,color=RideType)) +
geom_ribbon(data=plot_data_2,mapping = aes(x=dteday,ymin=plot_data_1$RollingMeanRides-2*RollingsdRides,ymax=plot_data_1$RollingMeanRides+2*RollingsdRides,fill=RideType),alpha=0.25,show.legend = FALSE) +
ggtitle("Number of Readers by Day",subtitle = "Smoothed by Rolling Average of 7 days")+
ylab("Number of Riders")+
xlab("Date") +
theme_classic()+
theme(
legend.position = c(1, 1),
legend.justification = c("right", "top"),
legend.box.just = "right",
legend.margin = margin(0, 0, 0, 0),
legend.title = element_blank()
)+
theme(plot.title = element_text(hjust = 0.5)) +
theme(plot.subtitle = element_text(hjust = 0.58))
fig
ggsave("./figs/rolling_rides_daily.png",plot = fig,device = "png")
rm(list=c("plot_data_1","plot_data_2"))
We will draw both the total and the average of riders by hour of the day by season.
plot_data<-bikesharing %>% pivot_longer(cols =c(registered,casual),names_to="RideType",values_to="NumberOfRides") %>%
select(hr,season,RideType,NumberOfRides) %>% mutate(season=factor(season,levels=c("winter","spring","summer","fall")),RideType=factor(RideType,levels=c("registered","casual")), hr=factor(hr))
fig<-ggplot(data = plot_data) +
geom_col(mapping = aes(x=hr,y=NumberOfRides),fill="lightblue")+
scale_x_discrete(breaks=seq(0,23))+
facet_grid(rows = vars(season),cols = vars(RideType),switch = "y") +
ggtitle("Hourly Distribution of Rides by Season") +
xlab("Hour of the Day")+
ylab("Number of Rides")+
theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
fig
ggsave("./figs/totalregisteredVsCasualridersbyhourbyseason.png",plot = fig,device = "png")
rm(plot_data)
plot_data_average <-
bikesharing %>% group_by(hr,season) %>% summarise(registered=mean(registered),casual=mean(casual)) %>%
pivot_longer(cols =c(registered,casual),names_to="RideType",values_to="AverageRides") %>%
select(hr,season,RideType,AverageRides) %>%
mutate(season=factor(season,levels=c("winter","spring","summer","fall"))
,RideType=factor(RideType,levels=c("registered","casual"))
, hr=factor(hr))
fig<-ggplot(data = plot_data_average) +
geom_col(mapping = aes(x=hr,y=AverageRides),fill="lightblue")+
scale_x_discrete( breaks=seq(0,23))+
facet_grid(rows = vars(season),cols = vars(RideType),switch = "y") +
ggtitle("Hourly distribution of Average Number of Rides by Season")+
ylab("Average Number of Rides")+
xlab("Hour of the Day") +
theme_classic()
fig
ggsave("./figs/AverageregisteredVsCasualridersbyhourbyseason.png",plot = fig,device = "png")
rm(plot_data_average)
We will check for distribution of rides by day of week by season
plot_data<-bikesharing %>%
pivot_longer(cols =c(registered,casual),names_to="RideType",values_to="NumberOfRides") %>%
select(weekday,season,RideType,NumberOfRides) %>%
mutate(
season=factor(season,levels=c("winter","spring","summer","fall"))
,RideType=factor(RideType,levels=c("registered","casual"))
,weekday=factor(weekday,levels = c("Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"))
)
fig<-ggplot(data = plot_data) +
geom_col(mapping = aes(x=weekday,y=NumberOfRides),fill="lightblue") +
facet_grid(rows = vars(season),cols = vars(RideType)) +
scale_y_continuous(labels=function(n){format(n, scientific = FALSE)})+
scale_x_discrete(guide = guide_axis(n.dodge = 2))+
ggtitle("Weekly Rides by Season")+
xlab("Day of the Week")+
ylab("Total Number of Rides")+
theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
plot(x=fig)
ggsave("./figs/NumRidesBySeason.png",device = "png",plot = fig)
rm(plot_data)
Now the plot of Average Number of Rides by Weekday and Season
plot_data <- bikesharing %>% group_by(weekday,season) %>%
summarise(registered=mean(registered),casual=mean(casual)) %>%
pivot_longer(cols = c(registered,casual),names_to="RideType",values_to="AverageRides") %>%
mutate(season=factor(season,levels=c("winter","spring","summer","fall"))
,RideType=factor(RideType,levels=c("registered","casual"))
,weekday=factor(weekday,levels = c("Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"))
)
fig <- ggplot(data = plot_data) +
geom_col(mapping = aes(x=weekday,y=AverageRides),fill="lightblue") +
facet_grid(rows = vars(season),cols = vars(RideType)) +
scale_y_continuous(labels=function(n){format(n, scientific = FALSE)})+
ggtitle("Average Rides by Day of the Week and Season") +
xlab("Day of the Week") +
ylab("Average Number of Rides") +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
plot(fig)
ggsave("./figs/AverageRidesByDaysofWeekAndSeason.png",device = "png",plot = fig)
rm(plot_data)
We will be using statistical tests to determine whether 2 groups are different. We will be testing whether 1. There is a difference between Registered Riders on the weekend vs weekdays 2. There is a difference between Casual Riders on the weekend vs weekdays 3. Is there a difference between Registered riders between various weekdays 4. Is there a difference between Casual riders between various weekdays
ttestData<-bikesharing %>%
select(weekday,registered,casual) %>%
mutate(IsWeekDay=case_when(
weekday %in% c("Saturday","Sunday") ~ "No",
TRUE~"Yes"
)) %>%
select(IsWeekDay,casual,registered) %>%
pivot_longer(cols=c(registered,casual),names_to="RideType",values_to="NumRides")
testResults<-ttestData %>%
group_by(RideType,IsWeekDay) %>%
nest() %>%
pivot_wider(names_from = IsWeekDay,values_from=data) %>%
mutate(
t_test = map2(No, Yes, ~{t.test(.x$NumRides, .y$NumRides) %>% tidy()})
) %>%
unnest(c(t_test)) %>%
select(RideType,statistic,p.value)
testResults
## # A tibble: 2 x 3
## # Groups: RideType [2]
## RideType statistic p.value
## <chr> <dbl> <dbl>
## 1 registered -19.0 1.51e- 79
## 2 casual 30.4 1.70e-188
rm(ttestData,testResults)
Looking at the p values and at .05 Confidence Level we can safely reject the NULL Hypothesis that means are identical. The alternative hypothesis is that the means are different . We can conclude that based on the data there is a difference between the number of users across categories between weekdays and weekends.
I will get back with differences of users between weekdays. Given this is count data how do we fit other methods to do the test.
plot_data<-bikesharing %>%
pivot_longer(cols=c("registered","casual"),names_to="RideType",values_to="NumRides") %>%
select(temp,atemp,hum,windspeed,RideType,NumRides)
plotscatters<-function(colname,busname)
{
coltoplot<-ensym(colname)
fig<-ggplot(data = plot_data) +
geom_point(mapping = aes(x=!!coltoplot,y=NumRides,color=RideType),alpha=.25) +
geom_smooth(mapping = aes(x=!!coltoplot,y=NumRides,color=RideType),method = "lm",alpha=.25) +
ggtitle(paste("Plot of Rides to", busname)) +
theme_classic()+
theme(plot.title = element_text(hjust = 0.5))+
theme(legend.title = element_blank())+
ylab("Number of Rides") +
xlab(busname)
ggsave(paste("./figs/",busname,".png"),device = "png",plot = fig)
return(fig)
}
varlis<-c("temp","atemp","hum","windspeed")
buslis<-c("Percieved Temperature","Actual Temperature","Humidity","Wind Speed")
figlist<-map2(varlis,buslis,plotscatters)
for (figs in figlist) {
plot(figs)
}
rm(plot_data)